Where the Bugs are¶
This chapter is work in progress ("beta"). It is incomplete and may change at any time.
Brief abstract/introduction/motivation. State what the chapter is about in 1-2 paragraphs. Then, have an introduction video:
from bookutils import YouTubeVideo
# YouTubeVideo("w4u5gCgPlmg")
Prerequisites
- Refer to earlier chapters as notebooks here, as here: Earlier Chapter.
import bookutils
Synopsis¶
To use the code provided in this chapter, write
>>> from debuggingbook.ChangeExplorer import <identifier>
and then make use of the following features.
For those only interested in using the code in this chapter (without wanting to know how it works), give an example. This will be copied to the beginning of the chapter (before the first section) as text with rendered input and output.
Mining Change Histories¶
We can mine past changes.
from pydriller import RepositoryMining # https://pydriller.readthedocs.io/
import pickle
REPO = 'https://github.com/uds-se/debuggingbook.git'
# REPO = '..'
class ChangeCounter:
def __init__(self, repo, filter=None, log=False, **kwargs):
self.repo = repo
self.log = log
if filter is None:
filter = lambda m: True
self.filter = filter
self.changes = {}
self.messages = {}
self.sizes = {}
self.hashes = set()
self.mine(**kwargs)
class ChangeCounter(ChangeCounter):
def mine(self, **kwargs):
miner = RepositoryMining(self.repo, **kwargs)
for commit in miner.traverse_commits():
for m in commit.modifications:
m.hash = commit.hash
m.committer = commit.committer
m.committer_date = commit.committer_date
m.msg = commit.msg
if self.filter(m):
self.update_stats(m)
class ChangeCounter(ChangeCounter):
def update_stats(self, m):
if not m.new_path:
return
node = tuple(m.new_path.split('/'))
if m.hash not in self.hashes:
self.hashes.add(m.hash)
self.update_size(node, len(m.source_code) if m.source_code else 0)
self.update_changes(node, m.msg)
self.update_elems(node, m)
def update_size(self, node, size):
self.sizes[node] = size
def update_changes(self, node, msg):
self.changes.setdefault(node, 0)
self.messages.setdefault(node, [])
self.changes[node] += 1
self.messages[node].append(msg)
def update_elems(self, node, m):
pass
from Timer import Timer
with Timer() as t:
change_counter = ChangeCounter(REPO)
t.elapsed_time()
list(change_counter.changes.keys())[:10]
change_counter.changes[('Chapters.makefile',)]
change_counter.messages[('Chapters.makefile',)]
change_counter.sizes[('Chapters.makefile',)]
Past Changes¶
Now, visualize treemap (notebooks/) and give it
- a size, depending on the number of lines;
- a color, depending on the total number of changes.
import easyplotly as ep
import plotly.graph_objects as go
import math
class ChangeCounter(ChangeCounter):
def map_node_sizes(self):
# Default: use log scale
return {node: math.log(self.sizes[node]) if self.sizes[node] else 0
for node in self.sizes}
# Alternative: use sqrt size
return {node: math.sqrt(self.sizes[node]) for node in self.sizes}
# Alternative: use absolute size
return self.sizes
def map_node_color(self, node):
if node and node in self.changes:
return self.changes[node]
return None
def map_node_text(self, node):
if node and node in self.changes:
return self.changes[node]
return None
def map_hoverinfo(self):
return 'label+text'
def map_colorscale(self):
return 'YlOrRd'
def map(self):
treemap = ep.Treemap(
self.map_node_sizes(),
text=self.map_node_text,
hoverinfo=self.map_hoverinfo(),
marker_colors=self.map_node_color,
marker_colorscale=self.map_colorscale(),
root_label=self.repo,
branchvalues='total'
)
fig = go.Figure(treemap)
fig.update_layout(margin=dict(l=0, r=0, t=30, b=0))
return fig
change_counter = ChangeCounter(REPO)
change_counter.map()
Past Fixes¶
class FixCounter(ChangeCounter):
def __init__(self, repo, **kwargs):
super().__init__(repo, filter=self.is_fix, **kwargs)
def is_fix(self, m):
return m.msg.startswith("Fix:") and not "show output" in m.msg
class FixCounter(ChangeCounter):
def map_node_text(self, node):
if node and node in self.messages:
return "<br>".join(self.messages[node])
return ""
def map_hoverinfo(self):
return 'label'
fix_counter = FixCounter(REPO)
fix_counter.map()
Fine-Grained Changes¶
Our aim: get tuples that are go beyond just files.
- For each file, get a listing of which elements are in which lines.
- Then, for each diff, find out which elems are affected.
Mapping Elements to Locations¶
import re
import magic # https://github.com/ahupp/python-magic
magic.from_buffer('''
#include <stdio.h>
int main(int argc, char *argv[]) {
printf("Hello, world!\n")
}
''')
DELIMITERS = [
(
# Python
re.compile(r'^Python.*'),
# Beginning of element
re.compile(r'^(async\s+)?(def|class)\s+(?P<name>\w+)\W.*'),
# End of element
re.compile(r'^[^#\s]')
),
(
# Jupyter Notebooks
re.compile(r'^(JSON|exported SGML|Jupyter).*'),
re.compile(r'^\s+"(async\s+)?(def|class)\s+(?P<name>\w+)\W'),
re.compile(r'^(\s+"[^#\s\\]|\s+\])')
),
(
# C source code
re.compile(r'^C source.*'),
re.compile(r'^[^\s].*\s+(?P<name>\w+)\s*[({].*'),
re.compile(r'^[}]')
),
(
# Catchall (like C; should work for any language using { } as delimiters)
re.compile(r'^.*'),
re.compile(r'^[^\s].*\s+(?P<name>\w+)\s*[({].*'),
re.compile(r'^[}]')
)
]
def rxdelim(s):
tp = magic.from_buffer(s)
for rxtp, rxbegin, rxend in DELIMITERS:
if rxtp.match(tp):
return rxbegin, rxend
return None, None
def elem_mapping(s, log=False):
rxbegin, rxend = rxdelim(s)
if rxbegin is None:
return []
mapping = [None]
current_elem = None
lineno = 0
for line in s.split('\n'):
lineno += 1
match = rxbegin.match(line)
if match:
current_elem = match.group('name')
elif rxend.match(line):
current_elem = None
mapping.append(current_elem)
if log:
print(f"{lineno:3} {current_elem}\t{line}")
return mapping
some_c_source = """
#include <stdio.h>
int foo(int x) {
return x;
}
struct bar {
int x, y;
}
int main(int argc, char *argv[]) {
return foo(argc);
}
"""
some_c_mapping = elem_mapping(some_c_source, log=True)
some_python_source = """
def foo(x):
return x
class bar(blue):
x = 25
def f(x):
return 26
def main(argc):
return foo(argc)
"""
some_python_mapping = elem_mapping(some_python_source, log=True)
# some_jupyter_source = open("Slicer.ipynb").read()
# some_jupyter_mapping = elem_mapping(some_jupyter_source, log=False)
Determining Changed Elements¶
from ChangeDebugger import diff, patch, print_patch
from diff_match_patch import diff_match_patch
class FineChangeCounter(ChangeCounter):
def changed_elems(self, mapping, start, length=0):
elems = set()
for line in range(start, start + length + 1):
if line < len(mapping) and mapping[line]:
elems.add(mapping[line])
return elems
def elem_size(self, elem, mapping, source):
source_lines = [''] + source.split('\n')
size = 0
for line_no in range(len(mapping)):
if mapping[line_no] == elem:
size += len(source_lines[line_no])
return size
fine_change_counter = FineChangeCounter(REPO)
assert fine_change_counter.changed_elems(some_python_mapping, 4) == {'foo'}
assert fine_change_counter.changed_elems(some_python_mapping, 4, 1) == {'foo', 'bar'}
assert fine_change_counter.changed_elems(some_python_mapping, 10, 2) == {'main'}
class FineChangeCounter(FineChangeCounter):
def update_elems(self, node, m):
old_source = m.source_code_before if m.source_code_before else ""
new_source = m.source_code if m.source_code else ""
patches = diff(old_source, new_source)
old_mapping = elem_mapping(old_source)
new_mapping = elem_mapping(new_source)
elems = set()
for patch in patches:
old_start_line = patch.start1 + 1
new_start_line = patch.start2 + 1
for (op, data) in patch.diffs:
data_length = data.count('\n')
if op == diff_match_patch.DIFF_INSERT:
elems |= self.changed_elems(old_mapping, old_start_line)
elems |= self.changed_elems(new_mapping, new_start_line,
data_length)
elif op == diff_match_patch.DIFF_DELETE:
elems |= self.changed_elems(old_mapping, old_start_line,
data_length)
elems |= self.changed_elems(new_mapping, new_start_line)
old_start_line += data_length
new_start_line += data_length
for elem in elems:
elem_node = node + (elem,)
self.update_size(elem_node,
self.elem_size(elem, new_mapping, new_source))
self.update_changes(elem_node, m.msg)
with Timer() as t:
fine_change_counter = FineChangeCounter(REPO)
t.elapsed_time()
fine_change_counter.map()
Lessons Learned¶
- Lesson one
- Lesson two
- Lesson three
Next Steps¶
Link to subsequent chapters (notebooks) here, as in:
Background¶
Cite relevant works in the literature and put them into context, as in:
The idea of ensuring that each expansion in the grammar is used at least once goes back to Burkhardt [Burkhardt et al, 1967.], to be later rediscovered by Paul Purdom [Purdom et al, 1972.].
Exercises¶
Close the chapter with a few exercises such that people have things to do. To make the solutions hidden (to be revealed by the user), have them start with
**Solution.**
Your solution can then extend up to the next title (i.e., any markdown cell starting with #).
Running make metadata will automatically add metadata to the cells such that the cells will be hidden by default, and can be uncovered by the user. The button will be introduced above the solution.
Exercise 1: Title¶
Text of the exercise
# Some code that is part of the exercise
pass
Some more text for the exercise
The content of this project is licensed under the
Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
The source code that is part of the content, as well as the source code used to format and display that content is licensed under the MIT License.
Last change: 2021-01-10 13:21:53+01:00 •
Cite •
Imprint